package com.webull.information.center.carwler.common.util.jsoup.prase_cn;

import java.util.Locale;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import com.webull.framework.util.UtilDate;
import com.webull.information.center.carwler.common.model.NewsInformation;
import com.webull.information.center.carwler.common.util.jsoup.HtmlBodyPrase;
import com.webull.information.center.common.constants.Constants;

/**
 * Parser for article pages of the Sina finance channel.
 * <p>
 * Fills the title, source name, push time, news time, body and language of a
 * {@link NewsInformation} from a fetched jsoup document. Several page layouts
 * are tried in turn; values that are already present on the
 * {@code NewsInformation} are generally kept (see the individual helpers).
 *
 * @author shimingjun
 * @date 2016-06-28 12:48:17
 * @version 1.0
 * @since JDK 1.8
 */
public class Sina_CNHtmlPrase implements HtmlBodyPrase {
	/** Matches the (possibly empty) run of non-digit characters at the end of a string. */
	private static final Pattern TRAILING_NON_DIGITS = Pattern.compile("[^0-9]*$");

	/** Regex for a trailing run of CJK ideographs (U+4E00 to U+9FA5), stripped from time strings. */
	private static final String TRAILING_CJK = "[\u4e00-\u9fa5]*$";

	protected final Logger logger = LogManager.getLogger(getClass());

	/**
	 * Extracts the news fields from {@code doc} into {@code info}.
	 * <p>
	 * Any exception raised while parsing is logged at warn level and not
	 * propagated, so {@code info} may be left partially filled.
	 *
	 * @param doc  the fetched article page
	 * @param info the object receiving the extracted values
	 */
	@Override
	public void praseNewsInfo(org.jsoup.nodes.Document doc, NewsInformation info) {
		try {
			Optional.ofNullable(doc.getElementById("artibodyTitle"))
					.map(titleEl -> StringUtils.stripToNull(titleEl.ownText()))
					.ifPresent(title -> info.setTitle(title));

			// Source name, e.g. "21世纪经济报道"; null when the page has no "page-info" element.
			Element pageInfo = doc.getElementsByClass("page-info").first();
			applyMediaName(doc, pageInfo, info);

			// Fallback layout: time and source share a single "time-source" span.
			if (StringUtils.isBlank(info.getSourceName()) || null == info.getNewsTime()) {
				applyTimeSourceSpan(pageInfo, info);
			}
			// Fallback layout: "div.artInfo" with separate pub_date / media_name spans.
			if (StringUtils.isBlank(info.getSourceName()) && doc.select("div.artInfo").first() != null) {
				applyArtInfo(doc, info);
			}

			// Article body parsing is shared with the tech-channel parser.
			Sina_CN_tech_HtmlPrase.processContext1(doc, info);

			if (StringUtils.isBlank(info.getLanguage())) {
				info.setLanguage(Constants.lang_zh);
			}

			if (StringUtils.isBlank(info.getSourceName())) {
				info.setSourceName("新浪网");
			}
		} catch (Exception e) {
			// Pass the exception as the throwable argument so the stack trace reaches the log.
			logger.warn("Failed to parse Sina finance article page", e);
		}
	}

	/** Reads the source name from the media_name span, preferring the non-weibo link inside page-info. */
	private void applyMediaName(Element doc, Element pageInfo, NewsInformation info) {
		firstOwnText(pageInfo, "span[data-sudaclick*=media_name] a:not(#media_weibo)")
				.ifPresent(name -> info.setSourceName(name));
		if (StringUtils.isBlank(info.getSourceName())) {
			firstOwnText(doc, "span[data-sudaclick*=media_name]")
					.ifPresent(name -> info.setSourceName(name));
		}
	}

	/** Reads source name, push time and news time from a combined "span.time-source" inside page-info. */
	private void applyTimeSourceSpan(Element pageInfo, NewsInformation info) {
		firstOwnText(pageInfo, "span.time-source").ifPresent(raw -> {
			// Expected shape: "2016年08月25日 09:42:07 中国电子银行网"; all whitespace is removed first.
			String compact = StringUtils.deleteWhitespace(normalizeNbsp(raw));

			// The non-digit tail after the timestamp is taken as the source name.
			// It may be empty, in which case later fallbacks still apply.
			Matcher tail = TRAILING_NON_DIGITS.matcher(compact);
			if (tail.find() && StringUtils.isBlank(info.getSourceName())) {
				info.setSourceName(tail.group());
			}

			// 19 chars cover "yyyy年MM月dd日HH:mm:ss"; when seconds are absent the cut
			// includes leading characters of the source name, which are stripped for the push time.
			String timeStr = StringUtils.substring(compact, 0, 19);
			info.setPushTime(StringUtils.removePattern(timeStr, TRAILING_CJK));

			applyNewsTime(info, timeStr, "yyyy年MM月dd日HH:mm");
		});
	}

	/** Reads push time, news time and source name from the "div.artInfo" layout. */
	private void applyArtInfo(Element doc, NewsInformation info) {
		firstOwnText(doc, "div.artInfo span#pub_date").ifPresent(raw -> {
			// Expected shape: "2016年08月29日 11:57"
			String timeStr = normalizeNbsp(raw);
			info.setPushTime(StringUtils.removePattern(timeStr, TRAILING_CJK));
			applyNewsTime(info, timeStr, "yyyy年MM月dd日 HH:mm");
		});
		firstOwnText(doc, "div.artInfo span#media_name a:not(#media_weibo)")
				.ifPresent(name -> info.setSourceName(name));
		if (StringUtils.isBlank(info.getSourceName())) {
			firstOwnText(doc, "div.artInfo span#media_name")
					.ifPresent(name -> info.setSourceName(name));
		}
	}

	/** Sets the news time from {@code timeStr} unless one is already set; parse failures are logged, not thrown. */
	private void applyNewsTime(NewsInformation info, String timeStr, String pattern) {
		if (info.getNewsTime() != null || StringUtils.isBlank(timeStr)) {
			return;
		}
		try {
			// NOTE(review): the literal 8 is presumably a UTC+8 offset - confirm against UtilDate.parse.
			info.setNewsTime(UtilDate.parse(timeStr, Locale.CHINESE, 8, pattern));
		} catch (Exception e) {
			// Best effort: the news time simply stays unset when the text does not parse.
			logger.debug("Could not parse news time '{}' with pattern '{}'", timeStr, pattern, e);
		}
	}

	/** Returns the stripped own-text of the first element under {@code root} matching {@code cssQuery}, if any. */
	private static Optional<String> firstOwnText(Element root, String cssQuery) {
		return Optional.ofNullable(root)
				.map(node -> node.select(cssQuery).first())
				.map(match -> StringUtils.stripToNull(match.ownText()));
	}

	/** Unescapes HTML entities and turns non-breaking spaces (U+00A0) into plain spaces. */
	private static String normalizeNbsp(String raw) {
		return (StringEscapeUtils.unescapeHtml4(raw) + "").replace('\u00A0', ' ');
	}

	/**
	 * Manual smoke run: fetches one live article over the network, parses it and
	 * prints the resulting {@link NewsInformation}.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be fetched
	 */
	public static void main(String[] args) throws Exception {
		// Other article URLs previously used for manual runs:
		// http://finance.sina.com.cn/chanjing/gsnews/20151203/032423916431.shtml
		// http://cj.sina.com.cn/article/detail/2145650153/50337?from=hq
		// http://finance.sina.com.cn/stock/roll/2016-08-22/doc-ifxvcsrm2142516.shtml
		// http://finance.sina.com.cn/stock/usstock/c/2016-08-29/doc-ifxvixeq0647084.shtml
		// http://finance.sina.com.cn/stock/hkstock/ggscyd/20150605/083622356512.shtml
		// http://finance.sina.com.cn/stock/hkstock/hkgg/20151028/141023607071.shtml
		// http://finance.sina.com.cn/stock/hkstock/hkgg/20150921/112523304019.shtml
		// http://finance.sina.com.cn/money/forex/forexroll/2016-08-29/doc-ifxvixer7396489.shtml
		// http://finance.sina.com.cn/stock/s/2016-08-30/doc-ifxvixeq0700618.shtml
		// http://finance.sina.com.cn/stock/s/2016-08-29/doc-ifxvitex9231317.shtml
		// http://finance.sina.com.cn/stock/s/2016-08-26/doc-ifxvixer7265364.shtml
		// http://cj.sina.com.cn/article/detail/5835524730/55808?from=hq
		String url2 = "http://finance.sina.com.cn/stock/hkstock/hkgg/2017-02-10/doc-ifyamkzq1225925.shtml?source=cj&dv=1";
		Connection connection = Jsoup.connect(url2)
				.userAgent(
						"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
				.header("x-client-data", RandomStringUtils.randomAlphanumeric(40));

		org.jsoup.nodes.Document doc = connection.timeout(10000).get();
		NewsInformation info = new NewsInformation();
		new Sina_CNHtmlPrase().praseNewsInfo(doc, info);
		System.out.println(info);
	}
}
